age Age
anaemia Decrease of red blood cells or hemoglobin (boolean)
creatinine_phosphokinase Level of the CPK enzyme in the blood (mcg/L)
diabetes If the patient has diabetes (boolean)
ejection_fraction Percentage of blood leaving the heart at each contraction (percentage)
high_blood_pressure If the patient has hypertension (boolean)
platelets Platelets in the blood (kiloplatelets/mL)
serum_creatinine Level of serum creatinine in the blood (mg/dL)
serum_sodium Level of serum sodium in the blood (mEq/L)
sex Woman or man (binary)
smoking If the patient smokes or not (boolean)
time Follow-up period (days)
DEATH_EVENT If the patient deceased during the follow-up period (boolean)
import pandas as pd
import numpy as np
import klib
import seaborn as sns
from colorama import Fore, Back, Style
import pandas_profiling as pp
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix
from plotly.offline import plot, iplot, init_notebook_mode
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
from statsmodels.formula.api import ols
import plotly.graph_objs as gobj
import plotly.figure_factory as ff
import warnings
# Silence every warning for the rest of the session; note this also hides
# deprecation and alignment warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")
# Initialise plotly's offline notebook mode before any figure is shown.
init_notebook_mode(connected=True)
# IPython magic (not plain Python): enables matplotlib's interactive GUI backend.
%matplotlib
Using matplotlib backend: Qt5Agg
# Read the heart-failure clinical records into a DataFrame and display it.
DATASET_CSV = "heart_failure_clinical_records_dataset.csv"
df = pd.read_csv(filepath_or_buffer=DATASET_CSV)
df
| age | anaemia | creatinine_phosphokinase | diabetes | ejection_fraction | high_blood_pressure | platelets | serum_creatinine | serum_sodium | sex | smoking | time | DEATH_EVENT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 75.0 | 0 | 582 | 0 | 20 | 1 | 265000.00 | 1.9 | 130 | 1 | 0 | 4 | 1 |
| 1 | 55.0 | 0 | 7861 | 0 | 38 | 0 | 263358.03 | 1.1 | 136 | 1 | 0 | 6 | 1 |
| 2 | 65.0 | 0 | 146 | 0 | 20 | 0 | 162000.00 | 1.3 | 129 | 1 | 1 | 7 | 1 |
| 3 | 50.0 | 1 | 111 | 0 | 20 | 0 | 210000.00 | 1.9 | 137 | 1 | 0 | 7 | 1 |
| 4 | 65.0 | 1 | 160 | 1 | 20 | 0 | 327000.00 | 2.7 | 116 | 0 | 0 | 8 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 294 | 62.0 | 0 | 61 | 1 | 38 | 1 | 155000.00 | 1.1 | 143 | 1 | 1 | 270 | 0 |
| 295 | 55.0 | 0 | 1820 | 0 | 38 | 0 | 270000.00 | 1.2 | 139 | 0 | 0 | 271 | 0 |
| 296 | 45.0 | 0 | 2060 | 1 | 60 | 0 | 742000.00 | 0.8 | 138 | 0 | 0 | 278 | 0 |
| 297 | 45.0 | 0 | 2413 | 0 | 38 | 0 | 140000.00 | 1.4 | 140 | 1 | 1 | 280 | 0 |
| 298 | 50.0 | 0 | 196 | 0 | 45 | 0 | 395000.00 | 1.6 | 136 | 1 | 1 | 285 | 0 |
299 rows × 13 columns
# Build an automated profiling report for the whole frame, then list the
# column dtypes.
profile = pp.ProfileReport(
    df,
    title="Heart Failure Profile",
    html={"style": {"full_width": True}},
)
profile
df.dtypes
age float64 anaemia int64 creatinine_phosphokinase int64 diabetes int64 ejection_fraction int64 high_blood_pressure int64 platelets float64 serum_creatinine float64 serum_sodium int64 sex int64 smoking int64 time int64 DEATH_EVENT int64 dtype: object
# Distribution plot of the age column for the whole cohort.
groupBylabels = ['age']
df_hist = [df["age"].values]
fig = ff.create_distplot(hist_data=df_hist, group_labels=groupBylabels)
# Left as the final expression so the notebook renders the returned figure.
fig.update_layout(title_text="Age Distribution plot")
# Box plot of age split by sex (male = 1, female = 0), drawing only outlier points.
fig = px.box(
    data_frame=df,
    x="sex",
    y="age",
    points="outliers",
).update_layout(title="Male = 1, Female = 0")
fig.show()
# Split the cohort by sex (1 = male, 0 = female), then by outcome.
male = df[df["sex"] == 1]
female = df[df["sex"] == 0]

# Each mask is built from the subset it filters, so the boolean key shares the
# subset's index instead of relying on pandas to reindex a full-frame mask.
male_survi = male[male["DEATH_EVENT"] == 0]
male_not = male[male["DEATH_EVENT"] == 1]
female_survi = female[female["DEATH_EVENT"] == 0]
female_not = female[female["DEATH_EVENT"] == 1]

# Donut chart of the four sex/outcome group sizes (reusing the subsets above
# rather than filtering a second time).
labels = ['Male - Survived','Male - Not Survived', "Female - Survived", "Female - Not Survived"]
values = [len(male_survi), len(male_not), len(female_survi), len(female_not)]
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.4)])
fig.update_layout(
    title_text="Analysis on Survival - Gender")
fig.show()
# Overlay the age distributions of survivors and non-survivors.
group_labels = ["Survived", "Not Survived"]
surv = df.loc[df["DEATH_EVENT"] == 0, "age"]
not_surv = df.loc[df["DEATH_EVENT"] == 1, "age"]
hist_data = [surv, not_surv]
fig = ff.create_distplot(
    hist_data=hist_data,
    group_labels=group_labels,
    bin_size=0.5,
).update_layout(title_text="Analysis in Age on Survival Status")
fig.show()
A close look at the diagram above shows that non-survival cuts across all age groups, and that survival is highest between ages 40 and 80.
# Violin plot of age per sex, coloured by outcome, with an inner box plot and
# every observation drawn as a point.
fig = px.violin(
    df,
    x="sex",
    y="age",
    color="DEATH_EVENT",
    box=True,
    points="all",
    hover_data=df.columns,
).update_layout(title_text="Analysis in Age and Gender on Survival Status")
fig.show()
Age Report
Survivors are concentrated in the 40 to 70 age range. Survival is highest for males aged 50 to 60 and for females aged 60 to 70.
# Same age/outcome violin plot, drawn once per binary risk factor.
for risk_factor, chart_title in (
    ("smoking", "Analysis in Age and Smoking on Survival Status"),
    ("diabetes", "Analysis in Age and Diabetes on Survival Status"),
):
    fig = px.violin(
        df,
        y="age",
        x=risk_factor,
        color="DEATH_EVENT",
        box=True,
        points="all",
        hover_data=df.columns,
    )
    fig.update_layout(title_text=chart_title)
    fig.show()
Histogram plots
# One outcome-coloured histogram (with a violin marginal) per lab measurement,
# shown in the same order as before.
for measurement in (
    "creatinine_phosphokinase",
    "ejection_fraction",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
):
    fig = px.histogram(
        df,
        x=measurement,
        color="DEATH_EVENT",
        marginal="violin",
        hover_data=df.columns,
    )
    fig.show()
# Compare survivor and non-survivor distributions for three lab measurements.
# The three plots differed only in column and title, so they share one loop;
# the misspelled "Ejaction Fraction" title is corrected.
group_labels = ['Survived', 'Not Survived']
for feature, plot_title in (
    ('serum_sodium', "Analysis in Serum Sodium on Survival Status"),
    ('serum_creatinine', "Analysis in Serum Creatinine on Survival Status"),
    ('ejection_fraction', "Analysis in Ejection Fraction on Survival Status"),
):
    surv = df[df['DEATH_EVENT']==0][feature]
    not_surv = df[df['DEATH_EVENT']==1][feature]
    hist_data = [surv, not_surv]
    fig = ff.create_distplot(hist_data, group_labels, bin_size=0.5)
    fig.update_layout(title_text=plot_title)
    fig.show()
Pie Charts
# Donut chart of how many patients do / do not have diabetes.
diabetes_no = df[df['diabetes'] == 0]
diabetes_yes = df[df['diabetes'] == 1]
labels = ['No Diabetes', 'Diabetes']
values = [len(group) for group in (diabetes_no, diabetes_yes)]
fig = go.Figure(
    data=[go.Pie(labels=labels, values=values, hole=.4)],
).update_layout(title_text="Analysis on Diabetes")
fig.show()
From the pie chart above, 58.2% (174) of the people in the dataset do not have diabetes and 41.8% (125) do.
# Pie of the 'diabetes' flag with one sector per DEATH_EVENT value.
fig = px.pie(df, values='diabetes',names='DEATH_EVENT', title='Diabetes Death Event Ratio')
fig.show()

# Outcome split inside each diabetes subset. Each mask comes from the subset
# being filtered, so its index matches and no implicit reindexing is needed.
diabetes_yes_survi = diabetes_yes[diabetes_yes["DEATH_EVENT"] == 0]
diabetes_yes_not_survi = diabetes_yes[diabetes_yes["DEATH_EVENT"] == 1]
diabetes_no_survi = diabetes_no[diabetes_no["DEATH_EVENT"] == 0]
# NOTE: the double underscore in this name is kept in case other cells use it.
diabetes__no_not_survi = diabetes_no[diabetes_no["DEATH_EVENT"] == 1]

# Donut chart of the four diabetes/outcome group sizes, reusing the subsets above.
labels = ['Diabetes Yes - Survived','Diabetes Yes - Not Survived', 'Diabetes NO - Survived', 'Diabetes NO - Not Survived']
values = [len(diabetes_yes_survi), len(diabetes_yes_not_survi),
          len(diabetes_no_survi), len(diabetes__no_not_survi)]
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.4)])
fig.update_layout(
    title_text="Analysis on Survival - Diabetes")
fig.show()
# Donut chart of how many patients are / are not anaemic.
anaemia_no = df[df['anaemia'] == 0]
anaemia_yes = df[df['anaemia'] == 1]
labels = ['No Anaemia', 'Anaemia']
values = [len(group) for group in (anaemia_no, anaemia_yes)]
fig = go.Figure(
    data=[go.Pie(labels=labels, values=values, hole=.4)],
).update_layout(title_text="Analysis on - Anaemia")
fig.show()
From the pie chart above, 56.9% (170) of the people in the dataset are not anaemic, while 43.1% (129) are anaemic.
# Pie of the 'anaemia' flag with one sector per DEATH_EVENT value
# (title typo "Ration" corrected to match the sibling charts).
fig = px.pie(df, values='anaemia',names='DEATH_EVENT', title='Anaemia Death Event Ratio')
fig.show()

# Outcome split inside each anaemia subset. Each mask comes from the subset
# being filtered, so its index matches and no implicit reindexing is needed.
anaemia_yes_survi = anaemia_yes[anaemia_yes["DEATH_EVENT"] == 0]
anaemia_yes_not_survi = anaemia_yes[anaemia_yes["DEATH_EVENT"] == 1]
anaemia_no_survi = anaemia_no[anaemia_no["DEATH_EVENT"] == 0]
anaemia_no_not_survi = anaemia_no[anaemia_no["DEATH_EVENT"] == 1]

# Donut chart of the four anaemia/outcome group sizes, reusing the subsets above.
labels = ['Anaemia Yes - Survived','Anaemia Yes - Not Survived', 'Anaemia No - Survived', 'Anaemia NO - Not Survived']
values = [len(anaemia_yes_survi), len(anaemia_yes_not_survi),
          len(anaemia_no_survi), len(anaemia_no_not_survi)]
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.4)])
fig.update_layout(
    title_text="Analysis on Survival - Anaemia")
fig.show()
# Donut chart of how many patients do / do not have high blood pressure.
hbp_no = df[df['high_blood_pressure'] == 0]
hbp_yes = df[df['high_blood_pressure'] == 1]
labels = ["No High BP", "High BP"]
values = [len(group) for group in (hbp_no, hbp_yes)]
fig = go.Figure(
    data=[go.Pie(labels=labels, values=values, hole=.4)],
).update_layout(title_text="Analysis on - High Blood Pressure")
fig.show()
From the pie chart above, 64.9% (194) of the people in the dataset do not have high blood pressure, while 35.1% (105) do.
# Pie of the 'high_blood_pressure' flag with one sector per DEATH_EVENT value.
fig = px.pie(df, values='high_blood_pressure',names='DEATH_EVENT', title='High Blood Pressure Death Event Ratio')
fig.show()

# Outcome split inside each blood-pressure subset. Each mask comes from the
# subset being filtered, so its index matches and no implicit reindexing is needed.
hbp_yes_survi = hbp_yes[hbp_yes["DEATH_EVENT"] == 0]
hbp_yes_not_survi = hbp_yes[hbp_yes["DEATH_EVENT"] == 1]
hbp_no_survi = hbp_no[hbp_no["DEATH_EVENT"] == 0]
hbp_no_not_survi = hbp_no[hbp_no["DEATH_EVENT"] == 1]

# Donut chart of the four blood-pressure/outcome group sizes, reusing the subsets above.
labels = ['HBP Yes - Survived','HBP Yes - Not Survived', 'HBP No - Survived', 'HBP NO - Not Survived']
values = [len(hbp_yes_survi), len(hbp_yes_not_survi),
          len(hbp_no_survi), len(hbp_no_not_survi)]
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.4)])
fig.update_layout(
    title_text="Analysis on Survival - HBP(high blood pressure)")
fig.show()
# Donut chart of how many patients do / do not smoke.
smoking_no = df[df['smoking'] == 0]
smoking_yes = df[df['smoking'] == 1]
labels = ['No Smoking', 'Smoking']
values = [len(group) for group in (smoking_no, smoking_yes)]
fig = go.Figure(
    data=[go.Pie(labels=labels, values=values, hole=.4)],
).update_layout(title_text="Analysis on - Smoking")
fig.show()
From the pie chart above, 67.9% (203) of the people in the dataset are non-smokers, while 32.1% (96) are smokers.
# Pie of the 'smoking' flag with one sector per DEATH_EVENT value.
fig = px.pie(df, values='smoking',names='DEATH_EVENT', title='Smoking Death Event Ratio')
fig.show()

# Outcome split inside each smoking subset. Each mask comes from the subset
# being filtered, so its index matches and no implicit reindexing is needed.
smoking_yes_survi = smoking_yes[smoking_yes["DEATH_EVENT"] == 0]
smoking_yes_not_survi = smoking_yes[smoking_yes["DEATH_EVENT"] == 1]
smoking_no_survi = smoking_no[smoking_no["DEATH_EVENT"] == 0]
smoking_no_not_survi = smoking_no[smoking_no["DEATH_EVENT"] == 1]

# Donut chart of the four smoking/outcome group sizes, reusing the subsets above.
labels = ['Smoking Yes - Survived','Smoking Yes - Not Survived', 'Smoking No - Survived', 'Smoking NO- Not Survived']
values = [len(smoking_yes_survi), len(smoking_yes_not_survi),
          len(smoking_no_survi), len(smoking_no_not_survi)]
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.4)])
fig.update_layout(
    title_text="Analysis on Survival - Smoking")
fig.show()
From the pie charts above, of the 203 non-smokers in the dataset 137 survived and 66 did not, and of the 96 smokers 66 survived and 30 did not.
Heatmap
# Annotated heatmap of the pairwise column correlations on a 10x10 inch figure.
plt.figure(figsize=(10, 10))
correlations = df.corr()
sns.heatmap(data=correlations, annot=True, cmap='coolwarm', vmin=-1)
<AxesSubplot:>
from sklearn.model_selection import train_test_split

# Predictors and target used by every model below.
Features = ['time', 'ejection_fraction', 'serum_creatinine']
X, y = df[Features], df["DEATH_EVENT"]

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=123,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm
import xgboost
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Each display name is paired with its unfitted estimator so the two cannot
# drift out of step; `names` and `models` are derived from the pairs in the
# same order as before.
_named_models = [
    ("CatBoostClassifier", CatBoostClassifier(verbose=False)),
    ("Logistic Regression", LogisticRegression()),
    ("Support Vector Machine", SVC()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Neural Network", MLPClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("XGBoost", XGBClassifier()),
    ("LGBMClassifier", lightgbm.LGBMClassifier(max_depth=2, random_state=4)),
    ("XGBRFClassifier", xgboost.XGBRFClassifier(max_depth=3, random_state=1)),
    ("GradientBoosting", GradientBoostingClassifier(max_depth=2, random_state=1)),
    ("GaussianNB", GaussianNB()),
    ("KNeighborsClassifier", KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')),
]
names = [label for label, _ in _named_models]
models = [estimator for _, estimator in _named_models]
# Fit each candidate on the training split, print its confusion matrix and
# accuracy on the held-out split, and collect the accuracies in model order.
accuracy = []
for model, name in zip(models, names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('Confusion matrix of ',name)
    print(confusion_matrix(y_test, y_pred))
    ac = accuracy_score(y_test, y_pred)
    print('Accuracy score is ',ac)
    accuracy.append(ac)
    print('='*50)

# Leaderboard sorted best-first with a fresh 0..n-1 index; the remaining
# sort_values arguments were explicit defaults and are omitted.
Accuracy_list = pd.DataFrame(list(zip(names, accuracy)), columns=['Model', 'Accuracy'])
Accuracy_list = Accuracy_list.sort_values('Accuracy', ascending=False, ignore_index=True)
Confusion matrix of CatBoostClassifier [[49 4] [15 22]] Accuracy score is 0.7888888888888889 ================================================== Confusion matrix of Logistic Regression [[49 4] [17 20]] Accuracy score is 0.7666666666666667 ================================================== Confusion matrix of Support Vector Machine [[52 1] [20 17]] Accuracy score is 0.7666666666666667 ================================================== Confusion matrix of Decision Tree [[46 7] [17 20]] Accuracy score is 0.7333333333333333 ================================================== Confusion matrix of Neural Network [[53 0] [37 0]] Accuracy score is 0.5888888888888889 ================================================== Confusion matrix of Random Forest [[49 4] [14 23]] Accuracy score is 0.8 ================================================== Confusion matrix of XGBoost [[48 5] [16 21]] Accuracy score is 0.7666666666666667 ================================================== Confusion matrix of LGBMClassifier [[50 3] [13 24]] Accuracy score is 0.8222222222222222 ================================================== Confusion matrix of XGBRFClassifier [[51 2] [16 21]] Accuracy score is 0.8 ================================================== Confusion matrix of GradientBoosting [[50 3] [14 23]] Accuracy score is 0.8111111111111111 ================================================== Confusion matrix of GaussianNB [[52 1] [21 16]] Accuracy score is 0.7555555555555555 ================================================== Confusion matrix of KNeighborsClassifier [[51 2] [17 20]] Accuracy score is 0.7888888888888889 ==================================================
# Bar chart of the accuracy leaderboard, one bar per model.
plt.rcParams['figure.figsize'] = 20, 6
sns.set_style("darkgrid")
ax = sns.barplot(x='Model', y='Accuracy', data=Accuracy_list, palette="rocket", saturation=1.5)
plt.xlabel("Model", fontsize=20)
plt.ylabel("Accuracy", fontsize=20)
plt.title("Accuracy of different Models", fontsize=20)
plt.xticks(fontsize=11, horizontalalignment='center', rotation=8)
plt.yticks(fontsize=13)

# Label each bar with its height as a percentage, just above the bar top.
# The bar origin is unpacked into bar_x/bar_y rather than x/y so the loop does
# not overwrite the global target Series `y` used for training.
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    bar_x, bar_y = p.get_xy()
    ax.annotate(f'{height:.2%}', (bar_x + width/2, bar_y + height*1.02), ha='center', fontsize='x-large')
plt.show()